Pacotes necessários:
#if(!require(PSF)) install.packages("PSF")
#if(!require(timetk)) remotes::install_github("business-science/timetk")
# Prefer requireNamespace() over require() for availability checks:
# require() attaches the package as a side effect and merely returns FALSE
# (instead of erroring) when the package is missing.
if (!requireNamespace("EflowStats", quietly = TRUE)) {
  remotes::install_github("USGS-R/EflowStats")
}
# Packages used throughout the analysis
pacotes <- c(
  "here",
  "usethis",
  "data.table",
  "HEobs",
  # "PSF",
  "tidyverse",
  "lubridate",
  "fs",
  "checkmate",
  # "xts",
  # "hydroGOF",
  # "ModelMetrics",
  # "forecast",
  # "timetk",
  "EflowStats",
  "NbClust",
  "cluster",
  "cluster.datasets",
  "cowplot",
  "clValid",
  "ggfortify",
  "clustree",
  "dendextend",
  "factoextra",
  "FactoMineR",
  "corrplot",
  "GGally",
  #"ggiraphExtra",
  "kableExtra",
  "tidymodels",
  "bestNormalize",
  "ggradar",
  "ggpubr"
)
# Attach all packages in one call
easypackages::libraries(pacotes)
Scripts:
# Project scripts: data loading and helper functions
source(here("R", "load-data.R"))
source(here("R", "utils.R"))

# Seven precomputed hydrological signatures per gauge station
seven_stats_tidy <- readRDS(here("output", "seven_stats.RDS"))

# Drop the station id so only the numeric signatures remain
hydrosigns <- seven_stats_tidy %>% select(-code_stn)

# Descriptive statistics, with skewness and kurtosis first
psych::describe(hydrosigns) %>%
  relocate(skew, kurtosis) %>%
  kable() #%>% kable_styling()
| skew | kurtosis | vars | n | mean | sd | median | trimmed | mad | min | max | range | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| lam1 | 4.2808943 | 21.0038092 | 1 | 87 | 1187.7445977 | 2726.5756901 | 290.760 | 534.8943662 | 328.9444620 | 11.650 | 18863.140 | 18851.490 | 292.3195975 |
| tau2 | -0.3290388 | 0.6411567 | 2 | 87 | 0.3990805 | 0.1055289 | 0.400 | 0.4023944 | 0.1037820 | 0.110 | 0.650 | 0.540 | 0.0113139 |
| tau3 | -0.1447398 | 0.9916048 | 3 | 87 | 0.3731034 | 0.0880448 | 0.370 | 0.3749296 | 0.0741300 | 0.120 | 0.640 | 0.520 | 0.0094394 |
| tau4 | -0.0919163 | 0.7960255 | 4 | 87 | 0.2167816 | 0.0796868 | 0.220 | 0.2180282 | 0.0741300 | -0.010 | 0.470 | 0.480 | 0.0085433 |
| ar1 | -2.6218779 | 7.4538085 | 5 | 87 | 0.9288391 | 0.0785307 | 0.956 | 0.9457746 | 0.0385476 | 0.581 | 0.992 | 0.411 | 0.0084194 |
| amplitude | -0.4038625 | -0.9805854 | 6 | 87 | 0.7027586 | 0.3087472 | 0.770 | 0.7143662 | 0.2816940 | 0.140 | 1.300 | 1.160 | 0.0331012 |
| phase | 1.3537602 | -0.0270891 | 7 | 87 | 89.4883333 | 82.8469966 | 50.494 | 77.7218310 | 12.9935064 | 19.029 | 281.270 | 262.241 | 8.8821304 |
Dados brutos
# Histogram of each raw signature, one facet per statistic
hydrosigns %>%
  pivot_longer(cols = everything(), names_to = "stats", values_to = "valor") %>%
  ggplot(aes(valor)) +
  geom_histogram(fill = "lightblue2", color = "black") +
  facet_wrap(~stats, scales = "free_x") +
  labs(x = "Value", y = "Frequency") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Dados normalizados
# Rescale every signature to [-1, 1].
# mutate_all() is superseded in dplyr >= 1.0; use mutate(across()).
hydrosigns_scaled <- hydrosigns %>%
  # alternative: standardize by mean and standard deviation
  #scale() %>%
  mutate(across(everything(), ~ scales::rescale(.x, to = c(-1, 1)))) %>%
  as_tibble()

# Long format for plotting
hydrosigns_scaled_long <- hydrosigns_scaled %>%
  pivot_longer(cols = everything(), names_to = "stats", values_to = "valor")

# (faceted-histogram version kept for reference)
# hydrosigns_scaled_long %>%
# ggplot(aes(x=valor)) +
# geom_histogram(fill = "lightblue2", color = "black") +
# facet_wrap(~stats) +
# labs(x = "Value", y = "Frequency") +
# theme_bw()

# Density of each rescaled signature, with mean line and rug marks
ggdensity(hydrosigns_scaled_long,
  x = "valor",
  add = "mean",
  rug = TRUE,
  color = "stats",
  fill = "stats", alpha = 0.3
)
# boxplot
# hydrosigns_scaled %>%
# pivot_longer(cols = everything(), names_to = "stats", values_to = "valor") %>%
# ggplot(aes(x=valor, y = stats)) +
# geom_boxplot(fill = "lightblue2", color = "black") +
# #facet_wrap(~stats, ncol = 2) +
# #labs(x = "Value", y = "Frequency") +
# theme_bw()
# Boxplot + jittered points of each rescaled signature.
# NOTE(review): mapping shape to 7 groups exceeds ggplot's 6-shape palette,
# so one group's jitter points are dropped (see the warnings below);
# consider specifying shapes manually.
hydrosigns_scaled_long %>%
ggboxplot(x = "stats",
y = "valor",
color = "stats",
#palette =c("#00AFBB", "#E7B800", "#FC4E07"),
add = "jitter",
shape = "stats",
size = 0.5
) #+ coord_flip()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 7. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 87 rows containing missing values (geom_point).
Distribuição das assinaturas após normalização dos dados.
# Correlation structure of the rescaled signatures
cor_data <- hydrosigns_scaled
cor_mat <- cor(cor_data)

# Upper triangle of the correlation matrix as ellipses
corrplot(cor_mat,
         type = "upper",
         method = "ellipse",
         tl.cex = 0.9)

# Significance test for each pairwise correlation
alpha <- 0.05
sig_test <- corrplot::cor.mtest(cor_data,
                                conf.level = 1 - alpha)

# Color map flagging significant correlations with stars
corrplot::corrplot(cor_mat,
                   p.mat = sig_test$p,
                   method = "color",
                   type = "upper",
                   sig.level = c(.001, 0.01, alpha),
                   pch.cex = 1.2,
                   insig = "label_sig",
                   pch.col = "green"
                   #order = "AOE"
)

# Pairwise scatterplot matrix
ggpairs(cor_data) + theme_bw()
# PCA of the rescaled signatures
data_pca <- hydrosigns_scaled
res.pca <- PCA(data_pca, graph = FALSE)

# Scree plot: variance explained by each component
fviz_screeplot(res.pca, addlabels = TRUE)

# (the original assigned get_pca_var(res.pca) to `var`, shadowing
# stats::var without ever using it; that dead assignment was removed)

# Contributions of variables to PC1 and PC2
c_pc1 <- fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)
c_pc2 <- fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

# Variable map colored by contribution to the principal axes
vec_pcs <- fviz_pca_var(res.pca, col.var = "contrib",
                        gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                        repel = TRUE # avoid text overlapping
) +
  theme_minimal() +
  ggtitle("Variables - PCA")

cowplot::plot_grid(c_pc1, c_pc2, vec_pcs, align = "v")
## Warning: Graphs cannot be vertically aligned unless the axis parameter is set.
## Placing graphs unaligned.
Testando diferentes escolhas para o número de clusters.
# K-means fits for k = 1..20 on the rescaled signatures, stored as a
# nested tibble with broom summaries per fit.
# set.seed() makes the stochastic kmeans() starts reproducible — the
# original only seeded much later (L372), so these fits were not.
set.seed(31)
data_km <- hydrosigns_scaled
kclusts <-
  tibble(k = 1:20) %>%
  mutate(
    kclust = map(k, ~kmeans(data_km, .x)),      # fitted model per k
    tidied = map(kclust, tidy),                 # one row per centroid
    glanced = map(kclust, glance),              # per-model fit summary
    augmented = map(kclust, augment, data_km)   # cluster label per station
  )
kclusts
## # A tibble: 20 × 5
## k kclust tidied glanced augmented
## <int> <list> <list> <list> <list>
## 1 1 <kmeans> <tibble [1 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 2 2 <kmeans> <tibble [2 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 3 3 <kmeans> <tibble [3 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 4 4 <kmeans> <tibble [4 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 5 5 <kmeans> <tibble [5 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 6 6 <kmeans> <tibble [6 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 7 7 <kmeans> <tibble [7 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 8 8 <kmeans> <tibble [8 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 9 9 <kmeans> <tibble [9 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 10 10 <kmeans> <tibble [10 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 11 11 <kmeans> <tibble [11 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 12 12 <kmeans> <tibble [12 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 13 13 <kmeans> <tibble [13 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 14 14 <kmeans> <tibble [14 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 15 15 <kmeans> <tibble [15 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 16 16 <kmeans> <tibble [16 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 17 17 <kmeans> <tibble [17 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 18 18 <kmeans> <tibble [18 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 19 19 <kmeans> <tibble [19 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 20 20 <kmeans> <tibble [20 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
# Flatten the nested results into per-topic tibbles
clusters <- unnest(kclusts, cols = c(tidied))        # centroids
assignments <- unnest(kclusts, cols = c(augmented))  # per-station labels
clusterings <- unnest(kclusts, cols = c(glanced))    # fit summaries

# One cluster plot per k (convex hulls on the first two PCs)
plts_klust2 <- map(
  kclusts$k,
  function(num_k) {
    fviz_cluster(kclusts[["kclust"]][[num_k]],
                 data = data_km,
                 show.clust.cent = TRUE,
                 ellipse.type = "convex",
                 pointsize = 0.8) +
      theme_bw() +
      ggtitle(paste0("k = ", num_k)) +
      coord_equal()
  }
)
cowplot::plot_grid(plotlist = plts_klust2[1:10], ncol = 3)

# Elbow curve: total within-cluster sum of squares against k
ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line() +
  geom_point()
# seven_stats_wide <- seven_stats_tidy %>%
#   pivot_wider(names_from = code_stn, values_from = statistic)
# seven_stats_wide

# NbClust on a precomputed Euclidean dissimilarity matrix,
# Ward (D2) linkage, evaluating all 30 indices for k in 2..24
diss_matrix <- dist(data_km, method = "euclidean", diag = FALSE)
clust_ana <- NbClust(data_km,
                     diss = diss_matrix,
                     distance = NULL,
                     min.nc = 2,
                     max.nc = 24,
                     method = "ward.D2",
                     index = "all")
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 5 proposed 24 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
clust_ana
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 4.6252 76.2062 23.4752 13.9651 640.7244 39166.7785 157.0478 58.4948
## 3 1.2205 59.6538 18.6877 13.2463 702.9625 43093.7989 73.0601 45.8359
## 4 1.3201 54.1931 14.7994 14.5696 768.1084 36231.9002 37.9928 37.4944
## 5 0.7974 50.9703 17.5610 15.7968 864.8869 18612.4368 31.6149 31.8206
## 6 1.2896 52.3744 15.0022 16.7661 945.6863 10588.2444 23.6509 26.2080
## 7 1.3599 53.5598 12.2066 17.5985 1015.9349 6427.5342 16.8239 22.1125
## 8 1.1872 53.9738 11.0003 18.1495 1112.7651 2758.4339 13.5997 19.1852
## 9 1.1877 54.4798 9.8985 18.6662 1198.6059 1301.5464 10.8300 16.8402
## 10 0.9495 54.9581 10.7674 19.1241 1236.8015 1035.8729 7.5015 14.9438
## 11 1.2626 56.7094 9.2906 19.8719 1363.5717 291.9232 6.1630 13.1105
## 12 1.4273 57.9284 7.1872 22.1052 1398.7648 231.8279 4.3538 11.6824
## 13 0.9686 58.0048 7.5670 22.4597 1447.3502 155.6529 3.4536 10.6608
## 14 0.9893 58.7947 7.9289 22.9672 1501.0601 97.3671 2.7434 9.6718
## 15 0.9924 60.2544 8.3852 23.6129 1545.6020 66.9870 2.0474 8.7242
## 16 1.1916 62.4661 7.5747 24.4094 1594.1140 43.6397 1.4439 7.8141
## 17 1.0890 64.3637 7.3831 25.0737 1651.8198 25.3792 1.2290 7.0609
## 18 1.0070 66.4382 7.7483 25.7374 1683.4700 19.7757 0.9769 6.3872
## 19 1.2162 69.2061 6.8667 26.5210 1736.1500 12.0260 0.8077 5.7424
## 20 1.2362 71.4788 5.9547 27.1421 1771.3019 8.8961 0.7057 5.2157
## 21 0.9972 73.1297 6.2288 27.5880 1809.3598 6.3328 0.5622 4.7900
## 22 1.1156 75.3576 5.9111 28.1286 1850.7356 4.3198 0.4517 4.3769
## 23 1.2169 77.5311 5.1697 28.6245 1883.8179 3.2280 0.3892 4.0120
## 24 0.9883 79.1194 5.4271 28.9722 1960.7576 1.4515 0.3401 3.7122
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 125.0244 4.0820 0.2664 0.6980 0.5367 0.7695 20.0696 1.3476 0.3209
## 3 137.3336 5.2094 0.3103 1.2980 0.3891 0.7132 20.5073 1.8007 0.3609
## 4 145.2495 6.3684 0.2539 1.3150 0.4033 0.1141 31.0685 28.3713 0.3617
## 5 157.7350 7.5039 0.2510 1.0022 0.4172 0.5740 10.3916 3.1632 0.3449
## 6 169.7721 9.1109 0.3131 0.8616 0.4268 0.7144 17.9893 1.7856 0.3402
## 7 176.3796 10.7983 0.2607 1.0583 0.3702 0.6105 10.8444 2.7508 0.3216
## 8 188.6396 12.4460 0.2432 0.9411 0.3867 1.5321 -2.0837 -1.3591 0.3072
## 9 220.9753 14.1790 0.3307 0.8789 0.3996 0.4696 18.0703 4.8534 0.2981
## 10 227.9138 15.9784 0.3153 0.8506 0.3585 0.3532 9.1579 6.9691 0.2871
## 11 329.7181 18.2127 0.3413 0.7717 0.3853 0.6781 7.1221 2.0324 0.2784
## 12 340.1906 20.4391 0.3433 0.8237 0.3867 0.6256 15.5612 2.6315 0.2694
## 13 375.6197 22.3978 0.3291 0.8613 0.3357 2.7258 -3.7988 -2.4779 0.2607
## 14 411.5347 24.6882 0.3643 0.8187 0.3517 0.3523 22.0604 7.7482 0.2529
## 15 429.3013 27.3697 0.3440 0.8229 0.3544 0.4493 11.0328 5.0375 0.2459
## 16 445.1417 30.5572 0.3255 0.7878 0.3745 0.4442 5.0049 4.5704 0.2388
## 17 486.0598 33.8172 0.3408 0.7580 0.3729 0.4844 5.3211 4.0493 0.2328
## 18 494.9763 37.3840 0.3282 0.7573 0.3757 2.8525 -1.9483 -2.2239 0.2270
## 19 534.1225 41.5819 0.3391 0.7173 0.3934 0.4632 16.2232 4.9383 0.2219
## 20 543.3145 45.7809 0.3204 0.7358 0.3838 11.9574 -0.9164 -2.0920 0.2170
## 21 560.5969 49.8497 0.3166 0.7040 0.3986 0.3742 5.0169 5.7267 0.2123
## 22 571.5709 54.5543 0.3082 0.6954 0.4072 18.0345 0.0000 0.0000 0.2080
## 23 582.9552 59.5155 0.3060 0.6268 0.4278 16.1311 -1.8760 -2.8552 0.2038
## 24 831.7566 64.3229 0.3019 0.5957 0.4414 0.6007 4.6535 2.6559 0.2000
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 29.2474 0.6820 1.6224 0.2496 0.4137 0.0197 3.9854 0.6710 0.3764
## 3 15.2786 0.6439 0.1317 0.6131 0.1126 0.0214 5.6466 0.6141 0.7463
## 4 9.3736 0.7155 -0.0760 0.6972 0.1126 0.0236 6.7988 0.5647 0.7190
## 5 6.3641 0.7204 0.2854 0.6962 0.1126 0.0235 3.9294 0.5170 0.2821
## 6 4.3680 0.7290 1.5213 0.7177 0.1476 0.0241 3.6715 0.4808 0.2681
## 7 3.1589 0.5853 0.0756 1.3696 0.1343 0.0249 4.8091 0.4390 0.2509
## 8 2.3981 0.5942 -0.0170 1.3571 0.1343 0.0252 4.6210 0.4129 0.2167
## 9 1.8711 0.5983 0.6322 1.3441 0.1862 0.0261 5.0004 0.3937 0.1591
## 10 1.4944 0.5846 0.1291 1.4301 0.1862 0.0272 4.6845 0.3667 0.1384
## 11 1.1919 0.5859 0.4351 1.4271 0.2059 0.0273 4.4218 0.3432 0.1060
## 12 0.9735 0.5723 3.5190 1.5110 0.2247 0.0276 4.6774 0.3257 0.1127
## 13 0.8201 0.4651 0.0667 2.3743 0.1085 0.0283 6.6233 0.3048 0.1063
## 14 0.6908 0.4664 0.4753 2.3534 0.1235 0.0284 6.4531 0.2928 0.0937
## 15 0.5816 0.4493 0.1897 2.5277 0.1235 0.0292 6.3417 0.2767 0.0869
## 16 0.4884 0.4454 0.1265 2.5465 0.1235 0.0293 6.3666 0.2601 0.0759
## 17 0.4153 0.4446 0.1662 2.5347 0.1336 0.0294 6.4637 0.2511 0.0739
## 18 0.3548 0.4425 0.0910 2.5334 0.1336 0.0296 6.3960 0.2399 0.0680
## 19 0.3022 0.4425 1.0280 2.5160 0.1651 0.0297 6.2660 0.2285 0.0599
## 20 0.2608 0.4010 0.1047 3.0403 0.1651 0.0299 7.6759 0.2136 0.0562
## 21 0.2281 0.4006 0.1772 3.0318 0.1651 0.0301 8.1044 0.2037 0.0479
## 22 0.1989 0.3984 0.0845 3.0367 0.1651 0.0303 8.1269 0.1961 0.0451
## 23 0.1744 0.3983 0.1875 3.0294 0.1857 0.0303 8.0832 0.1863 0.0329
## 24 0.1547 0.3971 0.3873 3.0328 0.2064 0.0304 8.0393 0.1784 0.0285
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.7154 26.6504 0.2259
## 3 0.6881 23.1145 0.0860
## 4 0.2524 11.8459 0.0000
## 5 0.5070 13.6158 0.0047
## 6 0.6744 21.7214 0.0895
## 7 0.5401 14.4778 0.0110
## 8 0.3404 11.6263 1.0000
## 9 0.5300 14.1914 0.0001
## 10 0.3011 11.6036 0.0000
## 11 0.5190 13.9039 0.0577
## 12 0.6051 16.9684 0.0130
## 13 0.3404 11.6263 1.0000
## 14 0.4792 13.0421 0.0000
## 15 0.4241 12.2211 0.0001
## 16 0.2524 11.8459 0.0017
## 17 0.3011 11.6036 0.0024
## 18 0.1898 12.8095 1.0000
## 19 0.5070 13.6158 0.0001
## 20 -0.0196 -52.1455 1.0000
## 21 0.1898 12.8095 0.0008
## 22 -0.2283 0.0000 NaN
## 23 0.1049 17.0735 1.0000
## 24 0.3729 11.7706 0.0207
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 2.0000 24.0000 3.0000 24.0000 11.0000 5.000 3.0000
## Value_Index 4.6252 79.1194 4.7875 28.9722 126.7702 9595.271 83.9877
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 24.0000 12.0000 8.0000 24.0000 2.0000 2.0000
## Value_Index 4.3174 248.8014 -0.2677 0.2432 0.5957 0.5367 0.7695
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 4.0000 3.0000 6.000 2.0000 2.0000
## Value_Index 20.0696 1.3476 0.3617 13.9688 0.729 1.6224 0.2496
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.0000 0 6.0000 0 24.0000
## Value_Index 0.4137 0 3.6715 0 0.0285
##
## $Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 1 2 1 1 1 1 1
set.seed(31)
# Elbow method: total within-cluster sum of squares for k up to 24
fviz_nbclust(hydrosigns_scaled,
             kmeans,
             method = "wss",
             k.max = 24) +
  theme_bw() +
  ggtitle("the Elbow Method")
# Gap statistic: 50 bootstrap reference samples, 30 random starts per k
gap_stat <- clusGap(hydrosigns_scaled,
                    FUN = kmeans,
                    nstart = 30,
                    K.max = 24,
                    B = 50)
fviz_gap_stat(gap_stat) +
  theme_bw() +
  ggtitle("fviz_gap_stat: Gap Statistic")
# Average silhouette width for k = 2..24
fviz_nbclust(hydrosigns_scaled,
             kmeans,
             method = "silhouette",
             k.max = 24) +
  theme_minimal() +
  ggtitle("The Silhouette Plot")
# Mean within/between sum of squares per k, mapped over the fitted models
# directly — the original indexed kclusts[["kclust"]] by k from inside
# mutate(), which is redundant and fragile if k were ever not 1..n.
kclusts_ss <- kclusts %>%
  mutate(
    within_ss = map_dbl(kclust, ~mean(.x[["withinss"]])),
    between_ss = map_dbl(kclust, ~mean(.x[["betweenss"]]))
  ) %>%
  select(k, contains("_ss"))

kclusts_ss_long <- kclusts_ss %>%
  pivot_longer(cols = -k, names_to = "measure", values_to = "valor")

# Compare both measures on a log scale; k = 1 is excluded because its
# between-cluster sum of squares is ~0 (log10 would be -Inf)
kclusts_ss_long %>%
  filter(k > 1) %>%
  ggplot(aes(x = k, y = log10(valor), fill = measure)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Cluster Model Comparison") +
  xlab("Number of Clusters") +
  ylab("Log10 Total Sum of Squares")
#scale_x_discrete(name = "Number of Clusters", limits = c("0", "2", "3", "4", "5", "6", "7", "8"))
O pacote NbClust fornece 30 índices para determinar o número relevante de clusters e propõe ao usuário o melhor esquema de agrupamento a partir dos diferentes resultados obtidos pela variação de todas as combinações de nº de clusters, medidas de distância e métodos de agrupamento.
# NbClust directly on the scaled data (Euclidean distance computed
# internally), Ward D2 linkage, k in 2..25
nbc_scaled <- NbClust(hydrosigns_scaled,
                      distance = "euclidean",
                      min.nc = 2,
                      max.nc = 25,
                      #method = "complete",
                      method = "ward.D2",
                      index = "all")
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 2 proposed 24 as the best number of clusters
## * 3 proposed 25 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Barplot of how many indices voted for each number of clusters
factoextra::fviz_nbclust(nbc_scaled) +
  theme_bw() +
  ggtitle("NbClust's optimal number of clusters")
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 8 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 2 proposed 24 as the best number of clusters
## * 3 proposed 25 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
# Same analysis with complete linkage and a narrower k range (2..10)
nbc_scaled2 <- NbClust(hydrosigns_scaled,
                       distance = "euclidean",
                       min.nc = 2,
                       max.nc = 10,
                       method = "complete",
                       index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 7 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 4 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
# Vote summary for the complete-linkage run
factoextra::fviz_nbclust(nbc_scaled2) +
  theme_bw() +
  ggtitle("NbClust's optimal number of clusters")
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 5 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 7 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 4 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 4 .
# Cluster assignments for k = 1..15, one integer vector per k.
# BUG FIX: the original wrote `tmp[k] <- kmeans(...)`, which silently kept
# only the first component of the kmeans object ($cluster) and emitted
# "number of items to replace is not a multiple of replacement length"
# warnings on every iteration. Preallocate a list and extract $cluster
# explicitly; the resulting list is identical, minus the warnings.
tmp <- vector("list", 15)
for (k in seq_len(15)) {
  tmp[[k]] <- kmeans(hydrosigns_scaled, k, nstart = 30)[["cluster"]]
}
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
# Assemble the per-k assignments into a data frame with columns k1..k15.
# `seq(1:15)` in the original is a seq_len() anti-pattern, and the two
# successive colnames() assignments are collapsed into one.
df <- data.frame(tmp)
colnames(df) <- paste0("k", seq_len(ncol(df)))

# Individual PCA coordinates (first two PCs) appended for plotting context
df.pca <- prcomp(df, center = TRUE, scale. = FALSE)
ind.coord <- df.pca$x[, 1:2]
df <- bind_cols(as.data.frame(df), as.data.frame(ind.coord))

# Clustering tree: how stations flow between resolutions k1..k15
clustree(df, prefix = "k")
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
# Internal validation (connectivity, Dunn, silhouette) across three
# clustering methods, with station codes as row names
station_df <- as.data.frame(hydrosigns_scaled)
rownames(station_df) <- seven_stats_tidy$code_stn
intern <- clValid(station_df,
                  nClust = 2:24,
                  clMethods = c("hierarchical", "kmeans", "pam"),
                  validation = "internal")
# Summary table
summary(intern) %>% kable() %>% kable_styling()
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
##
## hierarchical Connectivity 2.9290 2.9290 8.0337 13.0651 16.8119 19.6298 23.7238 29.2992 36.3036 40.5282 42.6948 47.7702 54.1623 56.6623 58.6956 63.2123 64.9623 67.8913 69.9841 72.2619 78.5302 84.6599 86.9933
## Dunn 0.4037 0.4037 0.2151 0.2162 0.2362 0.2583 0.2583 0.1679 0.2220 0.2220 0.2220 0.2615 0.2924 0.2952 0.2952 0.2952 0.2952 0.2952 0.2952 0.2952 0.2668 0.3536 0.3536
## Silhouette 0.4737 0.5167 0.4918 0.4852 0.4867 0.4333 0.3974 0.4213 0.4639 0.4581 0.4512 0.4137 0.3850 0.3724 0.3710 0.3670 0.3596 0.3278 0.3235 0.3190 0.3216 0.3498 0.3485
## kmeans Connectivity 3.4623 17.8766 23.7595 29.3294 32.7944 36.3377 45.6552 42.2679 43.3929 51.2139 53.3806 58.4560 66.7734 70.5778 72.6111 76.9734 76.2865 85.2619 87.2381 89.5159 95.7841 96.0341 98.3675
## Dunn 0.2129 0.0299 0.0603 0.0757 0.0815 0.1232 0.0479 0.1544 0.1544 0.1790 0.1790 0.1824 0.1720 0.1720 0.1720 0.1720 0.1985 0.0917 0.0917 0.0917 0.0999 0.1249 0.1249
## Silhouette 0.5353 0.3302 0.4101 0.4245 0.4339 0.4430 0.4105 0.4626 0.4579 0.4155 0.4133 0.3722 0.3627 0.3806 0.3783 0.3762 0.3710 0.3660 0.3680 0.3672 0.3699 0.3666 0.3653
## pam Connectivity 3.4623 18.4782 32.4012 34.1742 41.2405 49.7294 54.8048 57.4694 60.4583 62.0250 65.3052 67.5885 70.3266 76.5948 81.3948 85.7286 86.6869 88.8536 90.8869 98.4313 100.9313 103.2091 104.9591
## Dunn 0.2129 0.0305 0.0484 0.0484 0.0345 0.0702 0.0702 0.0751 0.0751 0.0751 0.0751 0.1047 0.1047 0.1047 0.1047 0.1085 0.1313 0.1313 0.1313 0.1313 0.1355 0.1355 0.1713
## Silhouette 0.5353 0.3231 0.2811 0.3266 0.4030 0.3208 0.2784 0.2999 0.2927 0.3153 0.3303 0.3406 0.3594 0.3621 0.3642 0.3322 0.3388 0.3449 0.3468 0.3454 0.3357 0.3379 0.3433
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 2.9290 hierarchical 2
## Dunn 0.4037 hierarchical 2
## Silhouette 0.5353 kmeans 2
# Compute dissimilarity matrix with euclidean distances
d <- dist(hydrosigns_scaled, method = "euclidean")# Hierarchical clustering using Ward's method
res.hc <- hclust(d, method = "ward.D2" )# Cut tree into 2 groups (the original comment said 5)
grp <- cutree(res.hc, k = 2)# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 2, border = 2:5) # add rectangle
res.hc2 <- hclust(d, method = "complete" )# Cut tree into 3 groups (the original comment said 5)
grp <- cutree(res.hc2, k = 3)# Visualize
plot(res.hc2, cex = 0.6) # plot tree
rect.hclust(res.hc2, k = 3, border = 2:6) # add rectangle
# Execution of k-means with k = 2 and k = 4 (the original comment said k=5)
final2 <- kmeans(hydrosigns_scaled, 2, nstart = 30)
fviz_cluster(final2, data = hydrosigns_scaled) + theme_bw() + ggtitle("k = 2")
final4 <- kmeans(hydrosigns_scaled, 4, nstart = 30)
fviz_cluster(final4, data = hydrosigns_scaled) + theme_bw() + ggtitle("k = 4")
# Mean of every raw signature within each NbClust cluster.
# summarise_all() is superseded in dplyr >= 1.0; use summarise(across()).
as.data.frame(seven_stats_tidy) %>%
  mutate(Cluster = nbc_scaled$Best.partition) %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean)) %>%
  kable()
| Cluster | code_stn | lam1 | tau2 | tau3 | tau4 | ar1 | amplitude | phase |
|---|---|---|---|---|---|---|---|---|
| 1 | 176.2609 | 1380.3913 | 0.3789855 | 0.3552174 | 0.2002899 | 0.9335507 | 0.8205797 | 48.01441 |
| 2 | 119.3889 | 449.2656 | 0.4761111 | 0.4416667 | 0.2800000 | 0.9107778 | 0.2511111 | 248.47172 |
# Attach the NbClust partition to the scaled signatures.
hydrosigns_df <- as.data.frame(hydrosigns_scaled) %>%
  mutate(code_stn = seven_stats_tidy$code_stn)
cluster_pos <- as.data.frame(nbc_scaled$Best.partition) %>%
  mutate(code_stn = seven_stats_tidy$code_stn)
colnames(cluster_pos) <- c("cluster", "code_stn")
# Explicit join key — the original relied on the implicit `by = "code_stn"`
# (see the "Joining, by" message), which breaks silently if columns change.
hydrosigns_final <- inner_join(cluster_pos, hydrosigns_df, by = "code_stn") %>%
  mutate(cluster = factor(cluster))
## Joining, by = "code_stn"
# Per-cluster means of the rescaled signatures for the radar chart.
# mutate_at()/summarise_all() are superseded in dplyr >= 1.0; use across().
hydrosigns_radar <- hydrosigns_final %>%
  select(-code_stn) %>%
  group_by(cluster) %>%
  # rescale each signature within its cluster; the grouping column is
  # excluded from across() automatically
  mutate(across(everything(), scales::rescale)) %>%
  summarise(across(everything(), mean)) %>%
  ungroup()

hydrosigns_radar %>%
  ggradar(
    font.radar = "roboto",
    grid.label.size = 10, # Affects the grid annotations (0%, 50%, etc.)
    axis.label.size = 6.5, # Affects the names of the variables
    group.point.size = 2 # Simply the size of the point
  )
table(hydrosigns_final$cluster)
##
## 1 2
## 69 18
# Pairwise scatterplot matrix colored by cluster
ggpairs(hydrosigns_final,
        columns = 3:ncol(hydrosigns_final),
        mapping = aes(color = factor(cluster), alpha = 0.5),
        diag = list(continuous = wrap("densityDiag")),
        lower = list(continuous = wrap("points", alpha = 0.9)))

# Parallel coordinate plot: each signature on its own axis, one line per
# station, colored by cluster. (A stray trailing comma after
# `showPoints = TRUE` — an empty argument — was removed.)
ggparcoord(data = hydrosigns_final,
           columns = 3:9,
           groupColumn = 1,
           alphaLines = 0.4,
           title = "Parallel Coordinate Plot for the HydroSigns",
           scale = "globalminmax",
           showPoints = TRUE) +
  theme(legend.position = "bottom")
# Distribution of each rescaled signature within each cluster
hydrosigns_valores <- hydrosigns_final %>%
  pivot_longer(cols = -(cluster:code_stn),
               names_to = "stats",
               values_to = "valor")
ggplot(hydrosigns_valores, aes(x = cluster, y = valor)) +
  geom_boxplot(aes(fill = stats), color = "black") +
  #coord_flip() +
  #facet_wrap(~stats, ncol = 2) +
  #labs(x = "Value", y = "Frequency") +
  theme_bw()